Christopher Boomhower1, Stacey Fabricant2, Alex Frye1, David Mumford2, Michael Smith1, Lindsay Vitovsky1
1 Southern Methodist University, Dallas, TX, US; 2 Penn Mutual Life Insurance Co., Horsham, PA
To begin our analysis, we need to load the data from our 89 source .txt files. Data is separated into two groups of files, Separation and Non-Separation, so it is loaded in two phases and then unioned together. Once data is loaded, steps are taken to remove non-US observations and those with no specified occupation, salary, or length-of-service level. Of a total of 8,423,336 observations, we end with 8,232,375 after removing these records.
## Import libraries
import pickle
import os
import psutil
import glob
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import requests
import json
import missingno as msno
import prettytable
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
# Bug fix: the sklearn.cross_validation module was deprecated in 0.18 and
# removed in 0.20; cross_val_score and ShuffleSplit now live in
# sklearn.model_selection.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from datetime import datetime
from dateutil.parser import parse
from itertools import cycle
from sklearn import metrics as mt
import itertools
## Library Options
# Silence SettingWithCopyWarning; this notebook intentionally assigns into
# filtered frames.
pd.options.mode.chained_assignment = None
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
## Pre-defined Functions for use later
def pickleObject(objectname, filename, filepath = "PickleJar/"):
    """Serialize `objectname` to <filepath><filename>.pkl.

    Parameters
    ----------
    objectname : any picklable object
    filename : file stem (".pkl" is appended)
    filepath : directory prefix; must end with a path separator
    """
    fullpicklepath = "{0}{1}.pkl".format(filepath, filename)
    # Context manager guarantees the handle is closed even if pickle.dump
    # raises (the original used explicit open/close and could leak on error).
    with open(fullpicklepath, 'wb') as picklefile:
        pickle.dump(objectname, picklefile)
def unpickleObject(filename, filepath = "PickleJar/"):
    """Load and return the object stored at <filepath><filename>.pkl.

    Parameters
    ----------
    filename : file stem (".pkl" is appended)
    filepath : directory prefix; must end with a path separator
    """
    fullunpicklepath = "{0}{1}.pkl".format(filepath, filename)
    # Bug fix: the original bound the result to a local named
    # `unpickleObject`, shadowing the function itself; use a distinct name
    # and a context manager so the handle is always closed.
    with open(fullunpicklepath, 'rb') as unpicklefile:
        loaded = pickle.load(unpicklefile)
    return loaded
def clear_display():
    """Clear the current notebook cell's output area.

    Bug fix: the original imported clear_output but never invoked it, so
    calling clear_display() was a silent no-op.
    """
    from IPython.display import clear_output
    clear_output()
## Pre-defined variables for use later
# Relative directory names for the separation (OPM) files, the quarterly
# employment (EMP) files, and the pickle cache used to skip reloads.
dataOPMPath = "dataOPM"
dataEMPPath = "dataEMP"
PickleJarPath = "PickleJar"
%%time
## Load OPMSeparation Files
# Collect every .txt in the OPM directory and normalize Windows backslashes
# so the literal 'dataOPM/...' comparisons below match on any platform.
OPMDataFiles = glob.glob(os.path.join(dataOPMPath, "*.txt"))
for i in range(0,len(OPMDataFiles)):
OPMDataFiles[i] = OPMDataFiles[i].replace("\\","/")
# Read each file with dtype='str' so coded columns (which may carry leading
# zeros) are not coerced to numbers on load.
OPMDataList = []
for i,j in zip(OPMDataFiles,range(0,len(OPMDataFiles))):
OPMDataList.append(pd.read_csv(i, dtype = 'str'))
display(OPMDataList[j].head())
## Load the SEPDATA_FY2015 file into its own object
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/SEPDATA_FY2015.txt']
OPMDataOrig = OPMDataList[indexes[0]]
%%time
#print(OPMDataFiles)
print(len(OPMDataOrig))
##### Merge / Modify Codes / Aggregate Attributes to be more descriptive per the metadata files
# Left-join each DT* lookup table onto the separation records so every coded
# column gains its human-readable descriptor column(s).
OPMDataMerged = OPMDataOrig.copy()
##AGYSUB - AGYTYP, AGY
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/DTagy.txt']
OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on = 'AGYSUB', how = 'left')
##EFDate - quarter, month
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/DTefdate.txt']
OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on = 'EFDATE', how = 'left')
##AGELVL - AGELVLT
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/DTagelvl.txt']
OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on = 'AGELVL', how = 'left')
##LOSLVL - LOSLVLT
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/DTloslvl.txt']
OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on = 'LOSLVL', how = 'left')
##LOC - LocTypeT, LocT
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/DTloc.txt']
OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on = 'LOC', how = 'left')
##OCC - OCCTYPT, OCCFAM
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/DTocc.txt']
OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on = 'OCC', how = 'left')
##PATCO - PATCOT
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/DTpatco.txt']
OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on = 'PATCO', how = 'left')
##PPGRD - PayPlan, PPGroup, PPTYP
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/DTppgrd.txt']
OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on = 'PPGRD', how = 'left')
##SALLVL - SALLVLT
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/DTsallvl.txt']
OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on = 'SALLVL', how = 'left')
##TOA - TOATYP
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/DTtoa.txt']
OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on = 'TOA', how = 'left')
##WORKSCH - WSTYPT
indexes = [i for i,x in enumerate(OPMDataFiles) if x == 'dataOPM/DTwrksch.txt']
OPMDataMerged = OPMDataMerged.merge(OPMDataList[indexes[0]], on = 'WORKSCH', how = 'left')
## Modify Data Types for numeric objects
OPMDataMerged["SALARY"] = OPMDataMerged["SALARY"].apply(pd.to_numeric)
OPMDataMerged["COUNT"] = OPMDataMerged["COUNT"].apply(pd.to_numeric)
OPMDataMerged["LOS"] = OPMDataMerged["LOS"].apply(pd.to_numeric)
# Filter cascade: each print reports how many rows the NEXT statement drops;
# counts are relative to the previous step's result, so order matters.
print("Original SEP data size of: "+str(len(OPMDataMerged)))
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["LOCTYP"] != "1"]))+" Non-US observations.")
## Remove Non-US Data
OPMDataMerged = OPMDataMerged[OPMDataMerged["LOCTYP"] == "1"]
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["OCCTYP"] == "3"]))+" observations with no specified Occupation.")
## Remove Observations with no specified occupation
OPMDataMerged = OPMDataMerged[OPMDataMerged["OCCTYP"] != "3"]
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["SALLVL"] == "Z"]))+" observations with no specified Salary.")
## Remove Observations with no specified salary
OPMDataMerged = OPMDataMerged[OPMDataMerged["SALLVL"] != "Z"]
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["LOSLVL"] == "Z"]))+" observations with no specified Length of Service.")
## Remove Observations with no specified LOSLVL
OPMDataMerged = OPMDataMerged[OPMDataMerged["LOSLVL"] != "Z"]
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["AGELVL"] == "A"]))+" observations of Age Level A")
## Remove Observations from Age Level A (less than 20 years old)
OPMDataMerged = OPMDataMerged[OPMDataMerged["AGELVL"] != "A"]
print("Removing "+str(len(OPMDataMerged[OPMDataMerged["AGELVL"] == "Z"]))+" observations with no specified Age Level.")
## Remove Observations with no specified Age Level
OPMDataMerged = OPMDataMerged[OPMDataMerged["AGELVL"] != "Z"]
## Fix differences in spaces on WORKSCHT Column
# Source files pad/spell WORKSCHT descriptions inconsistently, so rebuild a
# canonical label from the first character of the existing description.
OPMDataMerged["WORKSCHT"] = np.where(OPMDataMerged["WORKSCHT"].str[0]=="F", 'Full-time Nonseasonal',
np.where(OPMDataMerged["WORKSCHT"].str[0]=="I", 'Intermittent Nonseasonal',
np.where(OPMDataMerged["WORKSCHT"].str[0]=="P", 'Part-time Nonseasonal',
np.where(OPMDataMerged["WORKSCHT"].str[0]=="G", 'Full-time Seasonal',
np.where(OPMDataMerged["WORKSCHT"].str[0]=="J", 'Intermittent Seasonal',
np.where(OPMDataMerged["WORKSCHT"].str[0]=="Q", 'Part-time Seasonal',
np.where(OPMDataMerged["WORKSCHT"].str[0]=="T", 'Part-time Job Sharer Seasonal',
np.where(OPMDataMerged["WORKSCHT"].str[0]=="S", 'Part-time Job Sharer Nonseasonal',
np.where(OPMDataMerged["WORKSCHT"].str[0]=="B", 'Full-time Nonseasonal Baylor Plan',
'NO WORK SCHEDULE REPORTED' ### ELSE case represents Night
)
)
)
)
)
)
)
)
)
display(OPMDataMerged.head())
print("New SEP data size of: "+str(len(OPMDataMerged)))
display(OPMDataMerged.describe().transpose())
#del OPMDataList,OPMDataFiles
%%time
# Load the cached 4-quarter EMP frame if a pickle exists; otherwise read,
# merge, and clean all four quarters from scratch and cache the result.
if os.path.isfile(PickleJarPath+"/EMPDataOrig4Q.pkl"):
print("Found the File! Loading Pickle Now!")
EMPDataOrig4Q = unpickleObject("EMPDataOrig4Q")
else:
## Load EMPData Files
indexes = []
EMPDataFiles = []
EMPDataList = []
EMPDataOrig = []
for i,qtr in enumerate(["Q1", "Q2", "Q3", "Q4"]):
EMPDataFiles.append(glob.glob(os.path.join(dataEMPPath, qtr + "/*.txt")))
for j in range(0,len(EMPDataFiles[i])):
EMPDataFiles[i][j] = EMPDataFiles[i][j].replace("\\","/")
EMPDataList.append([])
for j,file in enumerate(EMPDataFiles[i]):
EMPDataList[i].append(pd.read_csv(file, dtype = 'str'))
if i == 0:
display(EMPDataList[i][j].head())
## Load the FactData files into its own object
indexes.append([])
##[qtr][fileindex from EMPDataList]
indexes[i]=[j for j,x in enumerate(EMPDataFiles[i]) if dataEMPPath + '/' + qtr + '/FACTDATA' in x]
EMPDataOrig.append([])
EMPDataOrig[i] = pd.concat([EMPDataList[i][indexes[i][j]] for j in range(0,len(indexes[i]))])
EMPDataOrig[i]["QTR"] = str(i+1)
## modify data type for numerics
# NOTE(review): str.replace('$', '') with the old regex=True default treats
# '$' as an end-of-string anchor, not a literal dollar sign -- confirm the
# raw files carry no '$' characters, or pass regex=False.
EMPDataOrig[i]["SALARY"] = EMPDataOrig[i]["SALARY"].str.replace(',', '').str.replace('$', '').str.replace(' ', '').apply(pd.to_numeric)
## Load Metadata
# NOTE(review): the `i for i,x` comprehensions below shadow the quarter index
# `i`; safe in Python 3 (comprehensions have their own scope) but fragile.
##AGYSUB - AGYTYP, AGY
ind2 = [i for i,x in enumerate(EMPDataFiles[i]) if x == dataEMPPath + '/' + qtr + '/DTagy.txt']
EMPDataOrig[i] = EMPDataOrig[i].merge(EMPDataList[i][ind2[0]], on = 'AGYSUB', how = 'left')
##AGELVL - AGELVLT
ind2 = [i for i,x in enumerate(EMPDataFiles[i]) if x == dataEMPPath + '/' + qtr + '/DTagelvl.txt']
EMPDataOrig[i] = EMPDataOrig[i].merge(EMPDataList[i][ind2[0]], on = 'AGELVL', how = 'left')
#LOSLVL - LOSLVLT
ind2 = [i for i,x in enumerate(EMPDataFiles[i]) if x == dataEMPPath + '/' + qtr + '/DTloslvl.txt']
EMPDataOrig[i] = EMPDataOrig[i].merge(EMPDataList[i][ind2[0]], on = 'LOSLVL', how = 'left')
EMPDataOrig[i]["LOS"] = EMPDataOrig[i]["LOS"].apply(pd.to_numeric)
##LOC - LocTypeT, LocT
ind2 = [i for i,x in enumerate(EMPDataFiles[i]) if x == dataEMPPath + '/' + qtr + '/DTloc.txt']
EMPDataOrig[i] = EMPDataOrig[i].merge(EMPDataList[i][ind2[0]], on = 'LOC', how = 'left')
##OCC - OCCTYPT, OCCFAM
ind2 = [i for i,x in enumerate(EMPDataFiles[i]) if x == dataEMPPath + '/' + qtr + '/DTocc.txt']
EMPDataOrig[i] = EMPDataOrig[i].merge(EMPDataList[i][ind2[0]], on = 'OCC', how = 'left')
##PATCO - PATCOT
ind2 = [i for i,x in enumerate(EMPDataFiles[i]) if x == dataEMPPath + '/' + qtr + '/DTpatco.txt']
EMPDataOrig[i] = EMPDataOrig[i].merge(EMPDataList[i][ind2[0]], on = 'PATCO', how = 'left')
##PPGRD - PayPlan, PPGroup, PPTYP
ind2 = [i for i,x in enumerate(EMPDataFiles[i]) if x == dataEMPPath + '/' + qtr + '/DTppgrd.txt']
EMPDataOrig[i] = EMPDataOrig[i].merge(EMPDataList[i][ind2[0]], on = 'PPGRD', how = 'left')
##SALLVL - SALLVLT
ind2 = [i for i,x in enumerate(EMPDataFiles[i]) if x == dataEMPPath + '/' + qtr + '/DTsallvl.txt']
EMPDataOrig[i] = EMPDataOrig[i].merge(EMPDataList[i][ind2[0]], on = 'SALLVL', how = 'left')
##TOA - TOATYP
ind2 = [i for i,x in enumerate(EMPDataFiles[i]) if x == dataEMPPath + '/' + qtr + '/DTtoa.txt']
EMPDataOrig[i] = EMPDataOrig[i].merge(EMPDataList[i][ind2[0]], on = 'TOA', how = 'left')
##WORKSCH - WSTYPT
ind2 = [i for i,x in enumerate(EMPDataFiles[i]) if x == dataEMPPath + '/' + qtr + '/DTwrksch.txt']
EMPDataOrig[i] = EMPDataOrig[i].merge(EMPDataList[i][ind2[0]], on = 'WORKSCH', how = 'left')
display(EMPDataOrig[i].head())
# Stack the four quarterly frames, then apply the same filter cascade used
# on the separation data (prints report the next statement's removals).
EMPDataOrig4Q = pd.concat([EMPDataOrig[j] for j in range(0,len(EMPDataOrig))])
print("Original EMP data size of: "+str(len(EMPDataOrig4Q)))
print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["LOCTYP"] != "1"]))+" Non-US observations.")
## Remove Non-US Data
EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["LOCTYP"] == "1"]
print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["OCCTYP"] == "3"]))+" observations with no specified Occupation.")
## Remove Observations with no specified occupation
EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["OCCTYP"] != "3"]
print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["SALLVL"] == "Z"]))+" observations with no specified Salary.")
## Remove Observations with no specified salary
EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["SALLVL"] != "Z"]
print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["LOSLVL"] == "Z"]))+" observations with no specified Length of Service.")
## Remove Observations with no specified LOSLVL
EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["LOSLVL"] != "Z"]
print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["AGELVL"] == "A"]))+" observations of Age Level A.")
## Remove Observations from Age Level A (less than 20 years old)
EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["AGELVL"] != "A"]
print("Removing "+str(len(EMPDataOrig4Q[EMPDataOrig4Q["AGELVL"] == "Z"]))+" observations with no specified Age Level.")
## Remove Observations with no specified Age Level
EMPDataOrig4Q = EMPDataOrig4Q[EMPDataOrig4Q["AGELVL"] != "Z"]
## Fix differences in spaces on WORKSCHT Column
# Same canonical relabeling as the SEP data: key off the first character.
EMPDataOrig4Q["WORKSCHT"] = np.where(EMPDataOrig4Q["WORKSCHT"].str[0]=="F", 'Full-time Nonseasonal',
np.where(EMPDataOrig4Q["WORKSCHT"].str[0]=="I", 'Intermittent Nonseasonal',
np.where(EMPDataOrig4Q["WORKSCHT"].str[0]=="P", 'Part-time Nonseasonal',
np.where(EMPDataOrig4Q["WORKSCHT"].str[0]=="G", 'Full-time Seasonal',
np.where(EMPDataOrig4Q["WORKSCHT"].str[0]=="J", 'Intermittent Seasonal',
np.where(EMPDataOrig4Q["WORKSCHT"].str[0]=="Q", 'Part-time Seasonal',
np.where(EMPDataOrig4Q["WORKSCHT"].str[0]=="T", 'Part-time Job Sharer Seasonal',
np.where(EMPDataOrig4Q["WORKSCHT"].str[0]=="S", 'Part-time Job Sharer Nonseasonal',
np.where(EMPDataOrig4Q["WORKSCHT"].str[0]=="B", 'Full-time Nonseasonal Baylor Plan',
'NO WORK SCHEDULE REPORTED' ### ELSE case represents Night
)
)
)
)
)
)
)
)
)
# Cache the cleaned frame so subsequent runs take the fast path above.
pickleObject(EMPDataOrig4Q, "EMPDataOrig4Q")
print("New EMP data size of: "+str(len(EMPDataOrig4Q)))
display(EMPDataOrig4Q.describe().transpose())
%matplotlib inline
#sns.boxplot(y = "SALARY", data = EMPDataOrig4Q)
With both our separation and non-separation data loaded, we calculate three new attributes through aggregation or calculation amongst various attributes.
1) SEP Count by Date & Occupation – total number of separations (of any type) for a given Date and Occupation;
2) SEP Count by Date & Location – total number of separations (of any type) for a given Date and Location;
3) Industry Average Salary – Average salary amongst non-separated employees, grouped by quarter, occupation, pay grade, and work schedule;
We proceed, by concatenating our Separation and Non-Separation observations, and merge these newly calculated attributes to the concatenated dataset.
%%time
%matplotlib inline
##Aggregate Number of Total Separations in current month for given Occ
AggSEPCount_EFDATE_OCC= pd.DataFrame({'SEPCount_EFDATE_OCC' : OPMDataMerged.groupby(["EFDATE", "OCC"]).size()}).reset_index()
display(AggSEPCount_EFDATE_OCC.head())
##Aggregate Number of Total Separations in current month for given LOC
AggSEPCount_EFDATE_LOC = pd.DataFrame({'SEPCount_EFDATE_LOC' : OPMDataMerged.groupby(["EFDATE", "LOC"]).size()}).reset_index()
display(AggSEPCount_EFDATE_LOC.head())
##Average Quarterly EMP Salary by occ
# Mean salary per (quarter, occupation, pay grade, work schedule) cell,
# computed as sum/count over the non-separated population.
AggIndAvgSalary = pd.DataFrame({'count' : EMPDataOrig4Q.groupby(["QTR", "OCC", "PPGRD", "WORKSCHT"]).size()}).reset_index()
AggIndAvgSalary2 = pd.DataFrame({'IndSalarySum' : EMPDataOrig4Q.groupby(["QTR", "OCC", "PPGRD", "WORKSCHT"])["SALARY"].sum()}).reset_index()
AggIndAvgSalary = AggIndAvgSalary.merge(AggIndAvgSalary2,on=["QTR", "OCC", "PPGRD", "WORKSCHT"])
AggIndAvgSalary["IndAvgSalary"] = AggIndAvgSalary["IndSalarySum"]/AggIndAvgSalary["count"]
del AggIndAvgSalary["count"]
del AggIndAvgSalary["IndSalarySum"]
display(AggIndAvgSalary.head())
#Merge Two Datasets
### NS SEP code means NonSeparation
###add hardcoded null value columns where applicable
EMPDataOrig4Q["SEP"] = "NS"
EMPDataOrig4Q["GENDER"] = np.nan
EMPDataOrig4Q["COUNT"] = np.nan
OPMDataMerged["DATECODE"] = OPMDataMerged["EFDATE"]
# Identical column lists align the two frames so concat stacks them cleanly.
OPMColList = ["AGYSUB", "SEP", "DATECODE", "AGELVL", "GENDER", "GSEGRD", "LOSLVL", "LOC", "OCC", "PATCO", "PPGRD", "SALLVL", "TOA", "WORKSCH", "COUNT", "SALARY", "LOS", "AGYTYP", "AGYTYPT", "AGY", "AGYT", "AGYSUBT", "QTR", "AGELVLT", "LOSLVLT", "LOCTYP", "LOCTYPT", "LOCT", "OCCTYP", "OCCTYPT", "OCCFAM", "OCCFAMT", "OCCT", "PATCOT", "PPTYP", "PPTYPT", "PPGROUP", "PPGROUPT", "PAYPLAN", "PAYPLANT", "SALLVLT", "TOATYP", "TOATYPT", "TOAT", "WSTYP", "WSTYPT", "WORKSCHT"]
EMPColList = ["AGYSUB", "SEP", "DATECODE", "AGELVL", "GENDER", "GSEGRD", "LOSLVL", "LOC", "OCC", "PATCO", "PPGRD", "SALLVL", "TOA", "WORKSCH", "COUNT", "SALARY", "LOS", "AGYTYP", "AGYTYPT", "AGY", "AGYT", "AGYSUBT", "QTR", "AGELVLT", "LOSLVLT", "LOCTYP", "LOCTYPT", "LOCT", "OCCTYP", "OCCTYPT", "OCCFAM", "OCCFAMT", "OCCT", "PATCOT", "PPTYP", "PPTYPT", "PPGROUP", "PPGROUPT", "PAYPLAN", "PAYPLANT", "SALLVLT", "TOATYP", "TOATYPT", "TOAT", "WSTYP", "WSTYPT", "WORKSCHT"]
OPMDataMerged = pd.concat([OPMDataMerged[OPMColList], EMPDataOrig4Q[EMPColList]], ignore_index=True)
print("Total concatenated data size for SEP and non-SEP: "+str(len(OPMDataMerged)))
# Attach the three derived attributes; left joins keep all rows and leave
# NaN where a key combination has no aggregate coverage.
OPMDataMerged = OPMDataMerged.merge(AggSEPCount_EFDATE_OCC, left_on = ['DATECODE','OCC'], right_on = ['EFDATE','OCC'], how = 'left')
OPMDataMerged = OPMDataMerged.merge(AggSEPCount_EFDATE_LOC, left_on = ['DATECODE','LOC'], right_on = ['EFDATE','LOC'], how = 'left')
OPMDataMerged = OPMDataMerged.merge(AggIndAvgSalary, on = ['QTR','OCC', 'PPGRD', 'WORKSCHT'], how = 'left')
OPMDataMerged["SalaryOverUnderIndAvg"] = OPMDataMerged["SALARY"] - OPMDataMerged["IndAvgSalary"]
# Drop the duplicated join keys produced by the two EFDATE merges.
del OPMDataMerged["EFDATE_x"]
del OPMDataMerged["EFDATE_y"]
display(OPMDataMerged.head())
display(OPMDataMerged.tail())
print(len(OPMDataMerged[OPMDataMerged["SEPCount_EFDATE_OCC"].isnull()]))
display(OPMDataMerged[OPMDataMerged["SEPCount_EFDATE_OCC"].isnull()][["SEP","DATECODE", "OCC"]].drop_duplicates())
These 50,993 Non-Separation observations have no coverage within the Separation dataset, so we remove them as an out-of-scope demographic for our analysis. Any attempt to predict these values would not have enough data to support a significant response.
# Drop rows whose (DATECODE, OCC) has no separation coverage, then verify
# the removal and inspect remaining coverage gaps for LOC and IndAvgSalary.
OPMDataMerged = OPMDataMerged[OPMDataMerged["SEPCount_EFDATE_OCC"].notnull()]
print(len(OPMDataMerged[OPMDataMerged["SEPCount_EFDATE_OCC"].isnull()]))
print(len(OPMDataMerged))
print(len(OPMDataMerged[OPMDataMerged["SEPCount_EFDATE_LOC"].isnull()]))
display(OPMDataMerged[OPMDataMerged["SEPCount_EFDATE_LOC"].isnull()][["SEP","DATECODE","LOC"]].drop_duplicates())
print(len(OPMDataMerged[OPMDataMerged["IndAvgSalary"].isnull()]))
display(OPMDataMerged[OPMDataMerged["IndAvgSalary"].isnull()][["QTR", "SEP","OCCT", "PPGRD", "WORKSCHT"]].drop_duplicates())
These 1,293 separation observations have no coverage within the EMP dataset, so we remove them as an out-of-scope demographic for our analysis. Any attempt to predict these values would not have enough data to support a significant response.
# Drop rows lacking an industry-average salary match and confirm none remain.
OPMDataMerged = OPMDataMerged[OPMDataMerged["IndAvgSalary"].notnull()]
print(len(OPMDataMerged[OPMDataMerged["IndAvgSalary"].isnull()]))
print(len(OPMDataMerged))
# Placeholder Chunks for Data Quality check of salary against GS Grade Level Ranges
We are interested to see how federal pension plans may impact attrition in this dataset. An interesting attribute to complement Length of Service is Years to Retirement. Using a FERS retirement-eligibility baseline of 57 years of age for all observations, and the lower limit of each age-level range, we compute a numeric Years to Retirement value.
#Add Column YearsToRetirement
"""
AGELVL,AGELVLT
A,Less than 20
B,20-24
C,25-29
D,30-34
E,35-39
F,40-44
G,45-49
H,50-54
I,55-59
J,60-64
K,65 or more
Z,Unspecified
"""
# Lower bound of each age band per the table above. Levels A and Z were
# filtered out earlier, so any remaining unmapped level maps to NaN, matching
# the original nested-np.where else-branch. astype(float) preserves the
# original float64 dtype even when no NaN is produced.
ageBandFloor = {"B": 20, "C": 25, "D": 30, "E": 35, "F": 40,
                "G": 45, "H": 50, "I": 55, "J": 60, "K": 65}
OPMDataMerged["LowerLimitAge"] = OPMDataMerged["AGELVL"].map(ageBandFloor).astype('float64')
# FERS retirement-eligibility baseline age applied to every observation.
retAge = 57
# Derived directly from LowerLimitAge instead of repeating the ten-way
# lookup; NaN propagates through the subtraction exactly as before.
OPMDataMerged["YearsToRetirement"] = retAge - OPMDataMerged["LowerLimitAge"]
print("Null Values for LowerLimitAge: " + str(len(OPMDataMerged[OPMDataMerged["LowerLimitAge"].isnull()])))
print("Null Values for YearsToRetirement: " + str(len(OPMDataMerged[OPMDataMerged["YearsToRetirement"].isnull()])))
display(OPMDataMerged.head())
display(OPMDataMerged.tail())
In addition to the OPM data, we merge 10 attributes from the Bureau of Labor Statistics (BLS). Data is sourced from Federal Government industry codes across all regions. Although assumed to be highly correlated, we source both Level (Total number) and Rate (Percentage of Level to total employment and / or job openings) for the following statistics: 1) Job Openings, 2) Layoffs, 3) Quits, 4) Total Separations, and 5) Other Separations. While Rate paints an aggregated, holistic picture for job market trends, Level provides a raw count for total separations alone. Both these statistics were captured by a monthly aggregate and merged to the OPM data by their respective months.
%%time
def bls(series, start, end):
    """Fetch monthly BLS timeseries data via the public v2 API.

    Parameters
    ----------
    series : list of [seriesID, friendlyName] pairs; only the IDs are sent.
    start, end : start/end year strings for the query window.

    Returns
    -------
    list of DataFrames, one per returned series, each with columns
    ["series id", "year", "period", "value", "footnotes"], restricted to
    monthly periods M01-M12.
    """
    headers = {'Content-type': 'application/json'}
    sID = [pair[0] for pair in series]
    # NOTE(review): registration key is hard-coded; consider reading it from
    # an environment variable instead of committing it to source.
    payload = json.dumps({"seriesid": sID,
                          "startyear": start,
                          "endyear": end,
                          "catalog": False,
                          "calculations": False,
                          "annualaverage": False,
                          "registrationkey": "7a89c8d7979349fba8914b8be16a1646"})
    p = requests.post('https://api.bls.gov/publicAPI/v2/timeseries/data/', data=payload, headers=headers)
    json_data = json.loads(p.text)
    frames = []
    for srs in json_data['Results']['series']:
        seriesId = srs['seriesID']
        rows = []
        for item in srs['data']:
            # Skip non-monthly codes (e.g. M13 annual averages, quarterly).
            if not ('M01' <= item['period'] <= 'M12'):
                continue
            footnotes = ""
            for footnote in item['footnotes']:
                if footnote:
                    footnotes = footnotes + footnote['text'] + ','
            rows.append({"series id": seriesId,
                         "year": item['year'],
                         "period": item['period'],
                         "value": item['value'],
                         "footnotes": footnotes})
        # Build each frame in one shot: row-wise DataFrame.append was O(n^2)
        # and was removed entirely in pandas 2.0.
        frames.append(pd.DataFrame(rows, columns=["series id", "year", "period", "value", "footnotes"]))
    return(frames)
%%time
# [BLS series id, descriptive column name] pairs for the federal-government
# JOLTS statistics (levels and rates).
seriesList = [
['JTU91000000JOL','BLS_FEDERAL_JobOpenings_Level'],
['JTU91000000LDL','BLS_FEDERAL_Layoffs_Level'],
['JTU91000000OSL','BLS_FEDERAL_OtherSep_Level'],
['JTU91000000QUL','BLS_FEDERAL_Quits_Level'],
['JTU91000000TSL','BLS_FEDERAL_TotalSep_Level'],
['JTU91000000JOR','BLS_FEDERAL_JobOpenings_Rate'],
['JTU91000000LDR','BLS_FEDERAL_Layoffs_Rate'],
['JTU91000000OSR','BLS_FEDERAL_OtherSep_Rate'],
['JTU91000000QUR','BLS_FEDERAL_Quits_Rate'],
['JTU91000000TSR','BLS_FEDERAL_TotalSep_Rate']
]
# Pull job openings and labor turnover data
JTL = bls(seriesList, "2014", "2015")
seriesList = pd.DataFrame(seriesList, columns = ["series id","sName"])
##We need to replace these with actual Descriptor Column Names
# For each series: rename its value column to the descriptive name, build a
# YYYYMM DATECODE key, drop the raw columns, and left-join onto the data.
for i in range(0,len(seriesList)):
JTL[i] = JTL[i].merge(seriesList, on = "series id", how = 'inner')
if len(JTL[i]) >0:
name = JTL[i]["sName"].drop_duplicates().values[0]
else:
name = str(i)
JTL[i][name] = JTL[i]["value"].apply(pd.to_numeric)
JTL[i]["DATECODE"] = JTL[i]["year"] + JTL[i]["period"].str[-2:]
del JTL[i]["value"]
del JTL[i]["year"]
del JTL[i]["period"]
del JTL[i]["series id"]
del JTL[i]["footnotes"]
del JTL[i]["sName"]
OPMDataMerged = OPMDataMerged.merge(JTL[i], on = "DATECODE", how = 'left')
display(JTL[i].head())
display(OPMDataMerged.head())
display(OPMDataMerged.tail())
display(pd.DataFrame({'StratCount' : OPMDataMerged.groupby(["SEP"]).size()}).reset_index())
There are several separation types we would like to either roll up, or remove altogether.
Roll-Up
We have chosen to roll up all retirement separation together. Separation categories of 1) SD,Retirement - Voluntary; 2) SE,Retirement - Early Out; 3) SF,Retirement - Disability; 4) SG,Retirement - Other are consolidated into one category "SD".
Removal
We have chosen to remove the following. 1) SB,Transfer Out - Mass Transfer; 2) SK,Death; 3) SL,Other Separation. 4) SJ,Termination (Expired Appt/Other)
# Drop out-of-scope separation types (mass transfer, death, other separation,
# expired-appointment termination), then fold all retirement flavors
# (voluntary / early-out / disability / other) into the single code "SD".
OPMDataMerged = OPMDataMerged[~OPMDataMerged["SEP"].isin(["SB", "SK", "SL", "SJ"])]
OPMDataMerged.loc[OPMDataMerged["SEP"].isin(["SD", "SE", "SF", "SG"]), "SEP"] = "SD"
In terms of data exploration, we first investigate numeric type attributes. Relationships, distributions, and correlation values are reviewed.
A new binary separation attribute is created to indicate whether non-sep or sep for EDA correlation purposes
# Retained dead code: earlier version of the cell below that ran against a
# sampled frame (SampledOPMData) instead of the full dataset.
#%%time
#
#
#cols = list(SampledOPMData.select_dtypes(include=['float64', 'int64']))
#cols.remove('COUNT')
#cols.remove('BLS_FEDERAL_OtherSep_Rate')
#cols.remove('BLS_FEDERAL_Quits_Rate')
#cols.remove('BLS_FEDERAL_TotalSep_Level')
#cols.remove('BLS_FEDERAL_JobOpenings_Rate')
#cols.remove('BLS_FEDERAL_OtherSep_Level')
#cols.remove('BLS_FEDERAL_Quits_Level')
#cols.remove('BLS_FEDERAL_JobOpenings_Level')
#cols.remove('BLS_FEDERAL_Layoffs_Rate')
#cols.remove('BLS_FEDERAL_Layoffs_Level')
#cols.remove('BLS_FEDERAL_TotalSep_Rate')
#cols.append('SEP')
#display(cols)
#
#plotNumeric = SampledOPMData[cols]
#
## Create binary separation attribute for EDA correlation review
##plotNumeric["SEP_bin"] = plotNumeric.SEP.replace("NS", 1)
##plotNumeric.loc[plotNumeric['SEP_bin'] != 1, 'SEP_bin'] = 0
##plotNumeric.SEP_bin = plotNumeric.SEP_bin.apply(pd.to_numeric)
#AttSplit = pd.get_dummies(plotNumeric['SEP'],prefix='SEP')
#display(AttSplit.head())
#plotNumeric = pd.concat((plotNumeric,AttSplit),axis=1) # add back into the dataframe
#
#display(plotNumeric.head())
#print("plotNumeric has {0} Records".format(len(plotNumeric)))
##print(plotNumeric.SEP_bin.dtype)
%%time
# Select the numeric attributes for EDA, excluding COUNT and the BLS series
# (which vary only by month), then append SEP for hue/grouping.
cols = list(OPMDataMerged.select_dtypes(include=['float64', 'int64']))
cols.remove('COUNT')
cols.remove('BLS_FEDERAL_OtherSep_Rate')
cols.remove('BLS_FEDERAL_Quits_Rate')
cols.remove('BLS_FEDERAL_TotalSep_Level')
cols.remove('BLS_FEDERAL_JobOpenings_Rate')
cols.remove('BLS_FEDERAL_OtherSep_Level')
cols.remove('BLS_FEDERAL_Quits_Level')
cols.remove('BLS_FEDERAL_JobOpenings_Level')
cols.remove('BLS_FEDERAL_Layoffs_Rate')
cols.remove('BLS_FEDERAL_Layoffs_Level')
cols.remove('BLS_FEDERAL_TotalSep_Rate')
cols.append('SEP')
display(cols)
plotNumeric = OPMDataMerged[cols]
# Create binary separation attribute for EDA correlation review
#plotNumeric["SEP_bin"] = plotNumeric.SEP.replace("NS", 1)
#plotNumeric.loc[plotNumeric['SEP_bin'] != 1, 'SEP_bin'] = 0
#plotNumeric.SEP_bin = plotNumeric.SEP_bin.apply(pd.to_numeric)
# One-hot encode SEP so each separation type can enter the correlation matrix.
AttSplit = pd.get_dummies(plotNumeric['SEP'],prefix='SEP')
display(AttSplit.head())
plotNumeric = pd.concat((plotNumeric,AttSplit),axis=1) # add back into the dataframe
display(plotNumeric.head())
print("plotNumeric has {0} Records".format(len(plotNumeric)))
#print(plotNumeric.SEP_bin.dtype)
%%time
# Pairwise scatter matrix of the numeric attributes colored by separation
# type; the one-hot SEP_* columns are dropped to keep the grid readable.
sns.set(font_scale=1)
sns.pairplot(plotNumeric.drop(['SEP_NS',
'SEP_SA',
'SEP_SC',
'SEP_SD',
'SEP_SH',
'SEP_SI'], axis=1), hue = 'SEP', palette="hls", plot_kws={"s": 50})
%%time
# Function modified from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
# Reset seaborn to its default theme before the histogram grid.
sns.set()
def draw_histograms(df, variables, n_rows, n_cols, figsize=(20, 20)):
    """Draw a grid of histograms, one subplot per column name.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    variables : iterable of column names; subplot i+1 shows variables[i].
    n_rows, n_cols : grid shape (n_rows * n_cols must cover len(variables)).
    figsize : overall figure size; parameterized (was hard-coded (20, 20))
        as a backward-compatible default.
    """
    fig=plt.figure(figsize=figsize)
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=20,ax=ax, color='#58D68D')
        ax.set_title(var_name+" Distribution")
    fig.tight_layout() # Improves appearance a bit.
    plt.show()
# Histogram every numeric attribute (SEP and its one-hot columns excluded)
# in a 6x3 grid.
draw_histograms(plotNumeric.drop(['SEP',
'SEP_NS',
'SEP_SA',
'SEP_SC',
'SEP_SD',
'SEP_SH',
'SEP_SI'], axis=1),
plotNumeric.drop(['SEP',
'SEP_NS',
'SEP_SA',
'SEP_SC',
'SEP_SD',
'SEP_SH',
'SEP_SI'], axis=1).columns, 6, 3)
%%time
# Inspired by http://seaborn.pydata.org/examples/many_pairwise_correlations.html
#plt.matshow(plotNumeric.corr())
sns.set(style='white')
corr = plotNumeric.drop(['SEP'], axis=1).corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask, k=1)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 20))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(250, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.set(font_scale=0.95)
heatCorr = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1,
square=True, annot=True, linewidths=1,
cbar_kws={"shrink": .5}, ax=ax, fmt='.1g')
#heatCorr.
ax.tick_params(labelsize=15)
cax = plt.gcf().axes[-1]
cax.tick_params(labelsize=15)
sns.plt.show()
#sns.heatmap(corr, annot=True, linewidths=0.01, cmap=cmap, ax=ax)
Based on the distribution of attributes identified above, we have decided to take the log transform of several attributes.
%%time
# Log Transform Columns Added
OPMDataMerged["SALARYLog"] = OPMDataMerged["SALARY"].apply(np.log)
OPMDataMerged["LOSLog"] = (OPMDataMerged["LOS"] + .00001).apply(np.log)
OPMDataMerged["SEPCount_EFDATE_OCCLog"] = OPMDataMerged["SEPCount_EFDATE_OCC"].apply(np.log)
OPMDataMerged["SEPCount_EFDATE_LOCLog"] = OPMDataMerged["SEPCount_EFDATE_LOC"].apply(np.log)
OPMDataMerged["IndAvgSalaryLog"] = OPMDataMerged["IndAvgSalary"].apply(np.log)
We next review categorical data to improve our understanding of factor levels.
# Retained dead code: earlier sampled-data (SampledOPMData) version of the
# categorical-column selection performed on the full dataset below.
#%%time
#
#cols = list(SampledOPMData.select_dtypes(include=['object']))
#dropCols = ["LOCTYP",
#            "LOCTYPT",
#            "OCCTYP",
#            "OCCTYPT",
#            "PPTYP",
#            "PPTYPT",
#            "AGYTYP",
#            "OCCFAM",
#            "PPGROUP",
#            "PAYPLAN",
#            "TOATYP",
#            "WSTYP",
#            "AGYSUBT",
#            "AGELVL",
#            "LOSLVL",
#            "LOC",
#            "OCC",
#            "PATCO",
#            "SALLVL",
#            "TOA",
#            "WORKSCH"]
#
#for i in dropCols:
#    cols.remove(i)
#
#plotCat = SampledOPMData[cols]
#display(plotCat.head())
#print("plotCat Has {0} Records".format(len(plotCat)))
#print("Number of colums = ", len(cols))
%%time
# Keep the descriptive ("T"-suffixed) categorical columns for plotting;
# drop the raw code columns and type/grouping codes they duplicate.
cols = list(OPMDataMerged.select_dtypes(include=['object']))
dropCols = ["LOCTYP",
"LOCTYPT",
"OCCTYP",
"OCCTYPT",
"PPTYP",
"PPTYPT",
"AGYTYP",
"OCCFAM",
"PPGROUP",
"PAYPLAN",
"TOATYP",
"WSTYP",
"AGYSUBT",
"AGELVL",
"LOSLVL",
"LOC",
"OCC",
"PATCO",
"SALLVL",
"TOA",
"WORKSCH"]
for i in dropCols:
cols.remove(i)
plotCat = OPMDataMerged[cols]
display(plotCat.head())
print("plotCat Has {0} Records".format(len(plotCat)))
print("Number of colums = ", len(cols))
High separation among the following:
Similar separation distributions among males and females, except more terminations due to contract expiration among males
High termination due to expired appt/other among following:
Bimodal Quit distribution with outlier spike at GSEGRD 9:
Individual transfers highest among levels 11, 12, 13
Majority of distribution resides in GS values per the GSEGRD observations described above.... Are other PPGRD values of any significance? What are corporate grades all about?
Top three Agencies with separation:
High contract termination in:
While Veteran Affairs and Army both have many quits and many retirees, the Army has significantly more individual transfers (on par with retirements)
Most contract terminations in 1st and 4th quarters
Retirement peaks in 2nd quarter
Number of quits increases from one quarter to the next
*Bear in mind these are quarters from single year only so time-sensitive trends may not be applicable*
High termination due to expired appt/other among following:
Number of Quits peaks at AGELVL D
Individual transfer counts mostly trend with Quits
Retirement highest at following:
Highest Quit count for LOSLVL A (< 1 year service) which then declines for levels B and C before spiking again at level D (5-9 years service)
Same pattern is observed for contract terminations but without any significant spikes with longer service
Large individual transfer spike at LOSLVL D (5-9 years service)
Retirement starts at LOSLVL D but trends upward to J
Contract terminations comprise most California terminations among top total separation states
East Coast locations may possibly have most individual transfers, the most being in Washington DC
03xx-General Admin, clerical, and office svcs highest separation by far but indicates both high number of Quits and Retirements
Many quits in 06xx-Medical
04xx-Natural Resources again indicates high number of contract terminations
01xx-Social Science has even number of Quits and retirements
Results skewed by GS
Should model full time only
def subCountPlot(att1, att2, thresh):
    """Zoomed count plots for the high-volume levels of a categorical column.

    Totals the global plotCat frame by att1, keeps only the att1 levels whose
    total count exceeds thresh, and draws two horizontal count plots side by
    side: overall counts (left) and counts split by att2 (right).
    """
    # Row totals per att1 level, appended as a "Total" column.
    tally = plotCat.groupby([att1, att2]).size().unstack(fill_value=0)
    tally = pd.concat([tally, tally.sum(axis=1)], axis=1)
    tally.rename(columns={0: "Total"}, inplace=True)
    # att1 levels busy enough to warrant the zoomed view.
    keep = tally[tally["Total"] > thresh].index.tolist()
    subset = plotCat[plotCat[att1].isin(keep)]
    f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 10), sharey=False)
    sns.countplot(y=att1, data=subset, color="blue", ax=ax1);  # Dark blue signifies zoomed data
    sns.countplot(y=att1, data=subset, hue=att2, palette="hls", ax=ax2);
def percBarPlot(att1, att2, numColors):
    """Stacked 100% bar chart of att2 shares within each att1 level of plotCat.

    att1 / att2 are column names in the global plotCat frame; numColors sizes
    the hls palette (normally the number of distinct att2 values).
    """
    # Count observations by att1 x att2 and append a per-row "Total" column.
    counts = plotCat.groupby([att1, att2]).size().unstack(fill_value=0)  # Get att1 sizes by att2
    counts = pd.concat([counts, counts.sum(axis=1)], axis=1)  # Append row totals as a new column
    counts.rename(columns={0: "Total"}, inplace=True)
    # create cmap from sns color palette
    my_cmap = ListedColormap(sns.color_palette('hls', numColors).as_hex())
    # Vectorized percentage: each att2 count over its att1 row total.
    # (Replaces the old nested Python loops that divided each scalar by a
    # one-element slice of the row.)
    perc = counts.div(counts["Total"], axis=0) * 100
    # BUGFIX: figsize must be a tuple; the previous {13, 6} was a set literal,
    # whose element order is undefined.
    perc.drop('Total', axis=1).plot(kind='bar', stacked=True, ylim=(0, 100),
                                    figsize=(13, 6),
                                    title=att1 + ' Percentage Plot',
                                    colormap=my_cmap)
temp = cols[:4] # for quick visualization debug only; may delete once complete
%%time
# Cell: for every categorical column (except the SEP label) draw two count
# plots -- overall counts (left) and counts broken out by separation type
# (right).  High-cardinality columns also get a thresholded "zoom" via
# subCountPlot; thresholds were tuned per column for this full dataset.
for i in cols:
if i != 'SEP':
plt.figure(i) # Required to create new figure each loop rather than drawing over previous object
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 10), sharey=False)
sns.countplot(y=i, data=plotCat, color="lightblue", ax=ax1);
sns.countplot(y=i, data=plotCat, hue="SEP", palette="hls", ax=ax2);
if i == 'AGYSUB':
subCountPlot(i, 'SEP', 10000)
elif i == 'LOCT':
subCountPlot(i, 'SEP', 4000)
elif i == 'OCCT':
subCountPlot(i, 'SEP', 2000)
elif i == 'PPGRD':
subCountPlot(i, 'SEP', 6000)
elif i == 'AGYT':
subCountPlot(i, 'SEP', 3000)
%%time
# Cell: stacked 100% bar charts of SEP share within each categorical level,
# plus two salary-level breakdowns (by GS grade and by PATCO category).
for i in cols:
if i != 'SEP':
percBarPlot(i, 'SEP', len(plotCat.SEP.drop_duplicates()))
percBarPlot('GSEGRD', 'SALLVLT', len(plotCat.SALLVLT.drop_duplicates()))
percBarPlot('PATCOT', 'SALLVLT', len(plotCat.SALLVLT.drop_duplicates()))
%%time
# Cell: split violin of salary by PATCO category and gender.
# Gender code 'Z' is excluded here -- presumably "unknown"; confirm in data definitions.
sns.violinplot(x="PATCOT", y="SALARY", hue="GENDER", data=OPMDataMerged[OPMDataMerged.GENDER != 'Z'], split=True,
inner="quart", palette={"M": "b", "F": "pink"})
sns.despine(left=True)
%%time
# Cell: split violin of salary by separation type and gender.
sns.set(style="whitegrid", palette="pastel", color_codes=True)
# Draw a nested violinplot and split the violins for easier comparison
sns.violinplot(x="SEP", y="SALARY", hue="GENDER", data=OPMDataMerged[OPMDataMerged.GENDER != 'Z'], split=True,
inner="quart", palette={"M": "b", "F": "pink"})
sns.despine(left=True)
%%time
# Cell: salary-by-SEP violins faceted by PATCO category, split by gender.
sns.factorplot(x="SEP", y="SALARY", hue="GENDER", col="PATCOT",
data=OPMDataMerged[OPMDataMerged.GENDER != 'Z'],
kind="violin", split=True, aspect=.4, size=10);
%%time
# Cell: same facets without the gender split (all genders included).
sns.factorplot(x="SEP", y="SALARY", col="PATCOT", data=OPMDataMerged,
kind="violin", split=True, aspect=.4, size=10, palette = "hls");
%%time
# Cell: violin grid of the numeric outcomes against SEP and PATCO category.
g = sns.PairGrid(data=OPMDataMerged,
x_vars=["SEP","PATCOT"],
y_vars=["SALARY", "LOS", "LowerLimitAge", "YearsToRetirement"],
aspect=1, size=10)
g.map(sns.violinplot, palette="pastel");
# Free the EDA frames; they are rebuilt later from the sampled data.
del(plotNumeric, plotCat)
After analyzing the above plots for our categorical data, we have decided to narrow our focus due to the large variability in the dataset. We take the below actions on our dataset:
In addition, we have opted to remove the below attributes for model generation:
Our goal is to limit our focus to Professional occupations, build a model, then test that generated model on the Administration segment of the population.
%%time
# Cell: narrow the merged OPM data to the modeling population, then split it
# into Professional (train) and Administration (test) PATCO groups.
#Removing Attributes
cols = list(OPMDataMerged.columns)
# Identifier/description columns removed prior to modeling.
dropCols = ["QTR",
"AGYTYP",
"AGYTYPT",
"AGY",
"AGYT",
"AGYSUB",
"AGYSUBT",
"GENDER",
"COUNT",
"PAYPLAN",
"PAYPLANT",
"PPGRD",
"LOSLVL",
"LOSLVLT",
"SALLVL",
"SALLVLT",
"OCC",
"OCCT"]
for i in dropCols:
if i in cols:
cols.remove(i)
OPMDataMerged = OPMDataMerged[cols]
# Keep only Full-time Nonseasonal observations
OPMDataMerged = OPMDataMerged[OPMDataMerged["WORKSCH"] == "F"]
#Remove the location US-SUPPRESSED (SEE DATA DEFINITIONS)
OPMDataMerged = OPMDataMerged[OPMDataMerged["LOC"] != "US"]
# Keep only General Schedule grades 7 and above (>= 7).
OPMDataMerged["GSEGRD"] = OPMDataMerged["GSEGRD"].apply(pd.to_numeric)
OPMDataMerged = OPMDataMerged[OPMDataMerged["GSEGRD"] >= 7]
#Focus model generation on White Collar Jobs only
OPMDataMerged = OPMDataMerged[OPMDataMerged["OCCTYP"] == "1"]
#Create a Training set for the Professional PATCO value, and a Testing set for Administration
OPMDataMergedProf = OPMDataMerged[OPMDataMerged["PATCO"] == "1"]
OPMDataMergedAdmin = OPMDataMerged[OPMDataMerged["PATCO"] == "2"]
display(OPMDataMergedProf.head())
print(len(OPMDataMergedProf))
display(OPMDataMergedAdmin.head())
print(len(OPMDataMergedAdmin))
#curious on stratum SEP counts for full remaining data
stratum = pd.DataFrame({'StratCount' : OPMDataMerged.groupby(["SEP"]).size()}).reset_index()
display(stratum)
#Assess Stratum SEP Counts for Prof, for use in sampling
# Cap each SEP stratum at maxSize observations; smaller strata are taken whole.
maxSize=4000
stratumProf = pd.DataFrame({'StratCount' : OPMDataMergedProf.groupby(["SEP"]).size()}).reset_index()
stratumProf.loc[stratumProf["StratCount"]>maxSize,"StratCountSample"] = maxSize
stratumProf.loc[stratumProf["StratCount"]<=maxSize,"StratCountSample"] = stratumProf["StratCount"]
#else: stratum["StratCountSample"] = stratum["StratCount"]
display(stratumProf)
#Assess Stratum SEP Counts for Admin, for use in sampling
# Same capped-stratum logic for the Administration group.
maxSize=4000
stratumAdmin = pd.DataFrame({'StratCount' : OPMDataMergedAdmin.groupby(["SEP"]).size()}).reset_index()
stratumAdmin.loc[stratumAdmin["StratCount"]>maxSize,"StratCountSample"] = maxSize
stratumAdmin.loc[stratumAdmin["StratCount"]<=maxSize,"StratCountSample"] = stratumAdmin["StratCount"]
#else: stratum["StratCountSample"] = stratum["StratCount"]
display(stratumAdmin)
%%time
def aggStratPop(stratum, OPMDataMerged):
    """Compute proportional stratified sample sizes per SEP class.

    For each SEP class in `stratum` (columns: SEP, StratCountSample), counts the
    class's observations by (DATECODE, AGELVL), converts each cell's share `p`
    into a per-cell sample size (p * StratCountSample, rounded to int), and
    returns the list of per-class DataFrames.  Displays each head as a side
    effect for notebook review.
    """
    AggStrat = []
    for i in range(len(stratum)):
        # .iloc replaces the deprecated .ix indexer (removed in pandas >= 1.0);
        # stratum has a default RangeIndex here, so the result is unchanged.
        sep = stratum["SEP"].iloc[i]
        StratCountSample = stratum["StratCountSample"].iloc[i]
        print("Stratum Sample Size Calculations for SEP: {}".format(sep))
        # Hoist the per-class subset; the original filtered OPMDataMerged twice.
        subset = OPMDataMerged[OPMDataMerged["SEP"] == sep]
        strat = pd.DataFrame({'StratCount' : subset.groupby(["DATECODE", "AGELVL"]).size()}).reset_index()
        strat["SEP"] = sep
        strat["TotalCount"] = len(subset)
        strat["p"] = strat["StratCount"] / strat["TotalCount"]
        strat["StratCountSample"] = StratCountSample
        # Rounding each cell independently, so the column sum may differ
        # slightly from StratCountSample (as in the original).
        strat["StratSampleSize"] = round(strat["p"] * StratCountSample).apply(int)
        AggStrat.append(strat)
        display(strat.head())
        print("totalStratumSampleSize: ", strat["StratSampleSize"].sum())
    return AggStrat
def SampleStrata(stratum, OPMDataMerged, FileName):
# Draw the stratified sample for one PATCO group: for each (SEP, DATECODE, AGELVL)
# cell, sample the size computed by aggStratPop, concatenate, pickle as FileName,
# and return the combined sample.
AggStrat = aggStratPop(stratum, OPMDataMerged)
SampledOPMStratumDataList = []
# Outer loop: one aggregated strata table per SEP class.
for i,StratSampleSize in enumerate(AggStrat):
SampledOPMStratumData = []
# Inner loop: one row per (DATECODE, AGELVL) cell within this SEP class.
for j in range(0,len(StratSampleSize)):
# NOTE(review): .ix is deprecated (removed in pandas >= 1.0); .iloc is the
# positional equivalent on these default-indexed frames.
SEP = StratSampleSize["SEP"].ix[j]
DATECODE = StratSampleSize["DATECODE"].ix[j]
AGELVL = StratSampleSize["AGELVL"].ix[j]
SampleSize = StratSampleSize["StratSampleSize"].ix[j]
print(SEP, DATECODE, AGELVL, SampleSize)
# Each draw is seeded with its own sample size, for reproducibility.
SampledOPMStratumDataList.append(OPMDataMerged[(OPMDataMerged["SEP"]==SEP)
& (OPMDataMerged["DATECODE"]==DATECODE)
& (OPMDataMerged["AGELVL"]==AGELVL)].sample(SampleSize, random_state=SampleSize))
# NOTE(review): SampledOPMStratumDataList is never reset between SEP classes, so the
# pd.concat below re-includes all previously sampled strata each time; the final concat
# therefore appears to duplicate earlier SEP groups -- verify the final counts against
# the intended per-class sample sizes.  (Original indentation was lost in this export,
# so the exact loop nesting of the next two lines could not be confirmed.)
SampledOPMStratumData.append(pd.concat(SampledOPMStratumDataList))
# NOTE(review): clear_display is not defined in this file view -- presumably a helper
# wrapping IPython's clear_output; confirm it exists at runtime.
clear_display()
SampledOPMData = pd.concat(SampledOPMStratumData).reset_index()
del SampledOPMData["index"]
pickleObject(SampledOPMData, FileName)
clear_display()
return SampledOPMData
Using a seed value equal to each strata sample size, we take random samples according to the computed sizes above. We loop through each Separation Type's Aggregated Strata Sample Sizes; Identify all observations matching on Datecode, Separation Type, and AgeLevel; and finally sample those observations with the computed sample size.
%%time
##Prof Data Sampling
# Load the cached Professional sample if it exists; otherwise draw (and pickle) one.
if os.path.isfile(PickleJarPath+"/SampledOPMDataProf.pkl"):
print("Found the File! Loading Pickle Now!")
SampledOPMDataProf = unpickleObject("SampledOPMDataProf")
else:
SampledOPMDataProf= SampleStrata(stratumProf, OPMDataMergedProf, "SampledOPMDataProf")
%%time
# Sanity checks on the Professional sample: size, preview, SEP class counts.
print(len(SampledOPMDataProf))
display(SampledOPMDataProf.head())
display(pd.DataFrame({'StratCount' : SampledOPMDataProf.groupby(["SEP"]).size()}).reset_index())
%%time
#### Analyze Missing Values
# missingno matrix of the most-null columns (bottom 15 by completeness).
filtered_msnoData = msno.nullity_sort(msno.nullity_filter(SampledOPMDataProf, filter='bottom', n=15, p=0.999), sort='descending')
msno.matrix(filtered_msnoData)
del filtered_msnoData
%%time
##Admin Data Sampling
# Same cache-or-sample flow for the Administration group.
if os.path.isfile(PickleJarPath+"/SampledOPMDataAdmin.pkl"):
print("Found the File! Loading Pickle Now!")
SampledOPMDataAdmin = unpickleObject("SampledOPMDataAdmin")
else:
SampledOPMDataAdmin= SampleStrata(stratumAdmin, OPMDataMergedAdmin, "SampledOPMDataAdmin")
%%time
print(len(SampledOPMDataAdmin))
display(SampledOPMDataAdmin.head())
display(pd.DataFrame({'StratCount' : SampledOPMDataAdmin.groupby(["SEP"]).size()}).reset_index())
%%time
#### Analyze Missing Values
filtered_msnoData = msno.nullity_sort(msno.nullity_filter(SampledOPMDataAdmin, filter='bottom', n=15, p=0.999), sort='descending')
msno.matrix(filtered_msnoData)
del filtered_msnoData
%%time
## Describe Summary for our Model Professional Subgroup for Modeling
display(SampledOPMDataProf.describe().transpose())
#%%time
#OPMDataMerged.to_csv("OPMDataMerged.csv")
#os.path.getsize("OPMDataMerged.csv") #Display file size in bytes
Chris... can you use the SampledOPMDataProf dataset, and re-run the Visuals?
%%time
# Cell: rebuild the numeric EDA frame from the sampled Professional data,
# dropping the BLS federal series columns from the numeric column list.
cols = list(SampledOPMDataProf.select_dtypes(include=['float64', 'int64']))
cols.remove('BLS_FEDERAL_OtherSep_Rate')
cols.remove('BLS_FEDERAL_Quits_Rate')
cols.remove('BLS_FEDERAL_TotalSep_Level')
cols.remove('BLS_FEDERAL_JobOpenings_Rate')
cols.remove('BLS_FEDERAL_OtherSep_Level')
cols.remove('BLS_FEDERAL_Quits_Level')
cols.remove('BLS_FEDERAL_JobOpenings_Level')
cols.remove('BLS_FEDERAL_Layoffs_Rate')
cols.remove('BLS_FEDERAL_Layoffs_Level')
cols.remove('BLS_FEDERAL_TotalSep_Rate')
cols.append('SEP')
display(cols)
plotNumeric = SampledOPMDataProf[cols]
# Create binary separation attribute for EDA correlation review
#plotNumeric["SEP_bin"] = plotNumeric.SEP.replace("NS", 1)
#plotNumeric.loc[plotNumeric['SEP_bin'] != 1, 'SEP_bin'] = 0
#plotNumeric.SEP_bin = plotNumeric.SEP_bin.apply(pd.to_numeric)
# One-hot encode SEP so per-class correlations can be inspected below.
AttSplit = pd.get_dummies(plotNumeric['SEP'],prefix='SEP')
display(AttSplit.head())
plotNumeric = pd.concat((plotNumeric,AttSplit),axis=1) # add back into the dataframe
display(plotNumeric.head())
print("plotNumeric has {0} Records".format(len(plotNumeric)))
#print(plotNumeric.SEP_bin.dtype)
%%time
# Cell: pairwise scatter matrix of the numeric features, colored by SEP.
sns.set(font_scale=1)
sns.pairplot(plotNumeric.drop(['SEP_NS',
'SEP_SA',
'SEP_SC',
'SEP_SD',
'SEP_SH',
'SEP_SI'], axis=1), hue = 'SEP', palette="hls", plot_kws={"s": 50})
%%time
# Function modified from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
sns.set()
def draw_histograms(df, variables, n_rows, n_cols):
    """Draw a grid of 20-bin histograms, one subplot per column in `variables`.

    df: DataFrame holding the data; variables: column names to plot;
    n_rows x n_cols: subplot grid shape (must hold len(variables) plots).
    """
    figure = plt.figure(figsize=(20, 20))
    for position, column in enumerate(variables, start=1):
        axis = figure.add_subplot(n_rows, n_cols, position)
        df[column].hist(bins=20, ax=axis, color='#58D68D')
        axis.set_title(column + " Distribution")
    figure.tight_layout()  # Improves appearance a bit.
    plt.show()
# Histogram grid over the numeric features (SEP label and dummies excluded).
draw_histograms(plotNumeric.drop(['SEP',
'SEP_NS',
'SEP_SA',
'SEP_SC',
'SEP_SD',
'SEP_SH',
'SEP_SI'], axis=1),
plotNumeric.drop(['SEP',
'SEP_NS',
'SEP_SA',
'SEP_SC',
'SEP_SD',
'SEP_SH',
'SEP_SI'], axis=1).columns, 6, 3)
%%time
# Inspired by http://seaborn.pydata.org/examples/many_pairwise_correlations.html
#plt.matshow(plotNumeric.corr())
# Cell: lower-triangle correlation heatmap of numeric features + SEP dummies.
sns.set(style='white')
corr = plotNumeric.drop(['SEP'], axis=1).corr()
# Generate a mask for the upper triangle
# NOTE(review): np.bool is deprecated/removed in newer NumPy; use bool when upgrading.
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask, k=1)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 20))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(250, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.set(font_scale=0.95)
heatCorr = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1,
square=True, annot=True, linewidths=1,
cbar_kws={"shrink": .5}, ax=ax, fmt='.1g')
#heatCorr.
ax.tick_params(labelsize=15)
cax = plt.gcf().axes[-1]
cax.tick_params(labelsize=15)
# NOTE(review): sns.plt was removed in newer seaborn; plt.show() is the portable call.
sns.plt.show()
#sns.heatmap(corr, annot=True, linewidths=0.01, cmap=cmap, ax=ax)
%%time
# Cell: rebuild the categorical EDA frame from the sampled Professional data.
cols = list(SampledOPMDataProf.select_dtypes(include=['object']))
# Type/code columns excluded from the categorical EDA.
dropCols = ["LOCTYP",
"LOCTYPT",
"OCCTYP",
"OCCTYPT",
"PPTYP",
"PPTYPT",
"AGYTYP",
"OCCFAM",
"PPGROUP",
"PAYPLAN",
"TOATYP",
"WSTYP",
"AGYSUBT",
"AGELVL",
"LOSLVL",
"LOC",
"OCC",
"PATCO",
"SALLVL",
"TOA",
"WORKSCH"]
for i in dropCols:
if(i in list(SampledOPMDataProf.columns)): cols.remove(i)
plotCat = SampledOPMDataProf[cols]
display(plotCat.head())
print("plotCat Has {0} Records".format(len(plotCat)))
print("Number of colums = ", len(cols))
%%time
# Cell: per-category count plots on the sampled data (same layout as the
# pre-sampling EDA; LOCT zoom threshold lowered for the smaller sample).
for i in cols:
if i != 'SEP':
plt.figure(i) # Required to create new figure each loop rather than drawing over previous object
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 10), sharey=False)
sns.countplot(y=i, data=plotCat, color="lightblue", ax=ax1);
sns.countplot(y=i, data=plotCat, hue="SEP", palette="hls", ax=ax2);
if i == 'AGYSUB':
subCountPlot(i, 'SEP', 10000)
elif i == 'LOCT':
subCountPlot(i, 'SEP', 1000)
elif i == 'OCCT':
subCountPlot(i, 'SEP', 2000)
elif i == 'PPGRD':
subCountPlot(i, 'SEP', 6000)
elif i == 'AGYT':
subCountPlot(i, 'SEP', 3000)
%%time
# Cell: SEP-share percentage plots per categorical column on the sampled data.
for i in cols:
if i != 'SEP':
percBarPlot(i, 'SEP', len(plotCat.SEP.drop_duplicates()))
%%time
# Cell: salary violins on the sampled Professional data.
sns.set(style="whitegrid", palette="pastel", color_codes=True)
sns.violinplot(x="PATCOT", y="SALARY", data=SampledOPMDataProf, split=True,
inner="quart")
sns.despine(left=True)
%%time
# Draw a nested violinplot and split the violins for easier comparison
sns.violinplot(x="SEP", y="SALARY", data=SampledOPMDataProf, split=True,
inner="quart")
sns.despine(left=True)
#%%time
#
#sns.factorplot(x="SEP", y="SALARY", col="PATCOT",
# data=SampledOPMDataProf,
# kind="violin", split=True, aspect=.5, size=15);
#%%time
#
#sns.factorplot(x="SEP", y="SALARY", col="PATCOT", data=SampledOPMDataProf,
# kind="violin", split=True, aspect=.4, size=10);
%%time
# Cell: violin grid of numeric outcomes vs SEP and PATCO on the sampled data.
g = sns.PairGrid(data=SampledOPMDataProf,
x_vars=["SEP","PATCOT"],
y_vars=["SALARY", "LOS", "LowerLimitAge", "YearsToRetirement"],
aspect=1, size=10)
g.map(sns.violinplot, palette="pastel", inner="quart");
Now that we have the dataset sampled, we still have some legwork necessary to convert our categorical attributes into binary integer values. Below we walk through this process for the following Attributes:
Once these attributes have been encoded and description columns removed, we end up with a total of 2446 attributes in our dataset for analysis in our model generation.
# Clean up old objects no longer needed, to clear up memory
process = psutil.Process(os.getpid())
print("Memory Usage before Cleanup: ", process.memory_info().rss)
# Intermediate notebook objects to discard if they still exist.  The loop below
# replaces 22 copy-pasted `if 'X' in dir(): del X` pairs; globals().pop(name, None)
# deletes the module-level binding when present and is a no-op otherwise.
_stale_names = [
    'AGELVL', 'AggIndAvgSalary', 'AggIndAvgSalary2',
    'AggSEPCount_EFDATE_LOC', 'AggSEPCount_EFDATE_OCC', 'AggStrat',
    'DATECODE', 'EMPColList', 'EMPDataOrig4Q', 'maxSize',
    'OPMColList', 'OPMDataFiles', 'OPMDataList', 'OPMDataMerged',
    'OPMDataOrig', 'SEP', 'SampleSize', 'SampledOPMStratumData',
    'SampledOPMStratumDataList', 'StratCountSample', 'StratSampleSize',
    'JTL',
]
for _name in _stale_names:
    globals().pop(_name, None)
process = psutil.Process(os.getpid())
print("Memory Usage after Cleanup: ", process.memory_info().rss)
display(SampledOPMDataProf.head())
SampledOPMDataProf.info()
%%time
# Cell: build the modeling frame (OPMAnalysisDataNoFam) from the sampled
# Professional data -- drop descriptive/redundant columns, then one-hot
# encode the remaining categorical code columns.
# NOTE(review): this cache check looks for "OPMAnalysisDataNoFam.pkl", but the
# result is pickled below under the name "OPMAnalysisData", so the cache will
# never be found on a re-run -- confirm which filename is intended.
if os.path.isfile(PickleJarPath+"/OPMAnalysisDataNoFam.pkl"):
print("Found the File! Loading Pickle Now!")
OPMAnalysisDataNoFam = unpickleObject("OPMAnalysisDataNoFam")
else:
OPMAnalysisDataNoFam = SampledOPMDataProf.copy()
# Columns removed outright.  NOTE(review): "QTR" appears twice in this list
# (harmless, since membership is checked before each delete).
cols = ["GENDER",
"DATECODE",
"QTR",
"COUNT",
"AGYTYPT",
"AGYT",
"AGYSUB",
"AGYSUBT",
"QTR",
"AGELVLT",
"LOSLVL",
"LOSLVLT",
"LOCTYPT",
"LOCT",
"OCCTYP",
"OCCTYPT",
"OCCFAM",
"OCCFAMT",
"OCC",
"OCCT",
"PATCO",
"PPGRD",
"PATCOT",
"PPTYPT",
"PPGROUPT",
"PAYPLAN",
"PAYPLANT",
"SALLVLT",
"TOATYPT",
"TOAT",
"WSTYP",
"WSTYPT",
"WORKSCH",
"WORKSCHT",
"SALARY",
"LOS",
"SEPCount_EFDATE_OCC",
"SEPCount_EFDATE_LOC"
]
#delete cols from analysis data
for col in cols:
if col in list(OPMAnalysisDataNoFam.columns):
del OPMAnalysisDataNoFam[col]
OPMAnalysisDataNoFam.info()
# Categorical code columns to expand into one-hot indicator columns.
cols = ["AGELVL",
"LOC",
"SALLVL",
"TOA",
"AGYTYP",
"AGY",
"LOCTYP",
"PPTYP",
"PPGROUP",
"TOATYP"
]
#Split Values for cols
for col in cols:
if col in list(OPMAnalysisDataNoFam.columns):
AttSplit = pd.get_dummies(OPMAnalysisDataNoFam[col],prefix=col)
display(AttSplit.head())
OPMAnalysisDataNoFam = pd.concat((OPMAnalysisDataNoFam,AttSplit),axis=1) # add back into the dataframe
del OPMAnalysisDataNoFam[col]
pickleObject(OPMAnalysisDataNoFam, "OPMAnalysisData")
display(OPMAnalysisDataNoFam.head())
print("Number of Columns: ",len(OPMAnalysisDataNoFam.columns))
OPMAnalysisDataNoFam.info()
Below is a display of all remaining attributes and their corresponding data types for analysis
%%time
# Cell: tabulate every remaining attribute with its dtype for review.
data_type = []
for idx, col in enumerate(OPMAnalysisDataNoFam.columns):
data_type.append(OPMAnalysisDataNoFam.dtypes[idx])
summary_df = {'Attribute Name' : pd.Series(OPMAnalysisDataNoFam.columns, index = range(len(OPMAnalysisDataNoFam.columns))), 'Data Type' : pd.Series(data_type, index = range(len(OPMAnalysisDataNoFam.columns)))}
summary_df = pd.DataFrame(summary_df)
display(summary_df)
del data_type, summary_df
We also scale the data values to remove bias in our models due to different attribute scales. Without scaling the data, attributes such as SALARY and LOS would carry heavier weights when compared against the binary encoded attributes and BLS data. This would cause unbalanced and improperly analyzed data for model creation.
# Cell: MinMax-scale the feature matrix.  The SEP label is removed first so
# only features are scaled; the fitted scaler is pickled for later reuse
# (e.g. on the Administration hold-out data).
OPMScaledAnalysisData = OPMAnalysisDataNoFam.copy()
del OPMScaledAnalysisData["SEP"]
%%time
OPMAnalysisScalerFit = MinMaxScaler().fit(OPMScaledAnalysisData)
## Pickle for later re-use if needed
pickleObject(OPMAnalysisScalerFit, "OPMAnalysisScalerFit")
OPMScaledAnalysisData = pd.DataFrame(OPMAnalysisScalerFit.transform(OPMScaledAnalysisData), columns = OPMScaledAnalysisData.columns)
display(OPMScaledAnalysisData.head())
Our objective is to reduce dimensionality through identification of principal components. We have chosen 100 as the maximum number of components to be produced, given our hope is to reduce the number of attributes needed for a model. We will review each component's explained variance further to determine the proper number of components to be included later during model generation. Note that randomized PCA was chosen in order to use singular value decomposition in our dimensionality-reduction efforts, due to the large size of our data set.
%%time
# Cell: fit randomized-SVD PCA on the scaled features, seeded by row count.
# NOTE(review): the narrative above says 100 components were chosen, but
# n_components here is the full column count -- confirm which is intended.
seed = len(OPMScaledAnalysisData)
print(OPMScaledAnalysisData.shape)
pca_class = PCA(n_components=len(OPMScaledAnalysisData.columns), svd_solver='randomized', random_state=seed)
pca_class.fit(OPMScaledAnalysisData)
Below, the resulting components have been ordered by eigenvalue and these values portrayed as ratios of variance explained by each component. In order to identify the principal components to be included during model generation, we review the rate at which explained variance decreases in significance from one principal component to the next. Accompanying these proportion values is a scree plot representing these same values in visual form. By plotting the scree plot, it is easier to judge where this rate of decreasing explained variance occurs. Note the rate of change in explained variance among the first 8 principal components, with another less significant change through the 22nd component. After the 22nd component, the rate of decreasing explained variance begins to somewhat flatten out.
%%time
#The amount of variance that each PC explains
var= pca_class.explained_variance_ratio_
sns.set(font_scale=1)
# Scree plot: percentage of explained variance per component (y capped at 9%).
plt.plot(range(1,len(OPMScaledAnalysisData.columns)+1), var*100, marker = '.', color = 'red', markerfacecolor = 'black')
plt.xlabel('Principal Components')
plt.ylabel('Percentage of Explained Variance')
plt.title('Scree Plot')
plt.axis([0, len(OPMScaledAnalysisData.columns)+1, -0.1, 9])
# Suppress scientific notation so the printed percentages are readable.
np.set_printoptions(suppress=True)
print(np.round(var, decimals=4)*100)
By now referring to the cumulative variance values and associated plot below, it may be seen that the cumulative variance increases along a fairly smooth parabolic curve. In an attempt to achieve a cumulative variance explained of greater than 80%, we end at 22 principal components. For this reason, 22 principal components may be selected as being the most appropriate for separation classification modeling given the variables among these data.
#Cumulative Variance explains
# Running total of explained-variance percentages across components.
var1=np.cumsum(np.round(pca_class.explained_variance_ratio_, decimals=4)*100)
plt.plot(range(1,len(OPMScaledAnalysisData.columns)+1), var1, marker = '.', color = 'green', markerfacecolor = 'black')
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance (Sum %)')
plt.title('Cumulative Variance Plot')
# NOTE(review): the y-axis upper bound is tied to the column count rather than
# 100% -- confirm that is intentional.
plt.axis([0, len(OPMScaledAnalysisData.columns)+1, 10, len(OPMScaledAnalysisData.columns)+1])
print(var1)
We proceed to analyze the first 4 component Feature Loadings more carefully. See below, plots of the top 10 loadings for each component.
# Bar-plot the 10 largest-magnitude feature loadings for each of the first
# four principal components (2x2 subplot grid).
plt.rcParams['figure.figsize'] = (20, 12)
fig = plt.figure()
for i in range(0,4):
components = pd.Series(pca_class.components_[i], index=OPMScaledAnalysisData.columns)
maxcomponent = pd.Series(pd.DataFrame(abs(components).sort_values(ascending=False).head(10)).index)
matplotlib.rc('xtick', labelsize=8)
ax = fig.add_subplot(2,2,i + 1)
weightsplot = pd.Series(components, index=maxcomponent)
weightsplot.plot(title = "Principal Component "+ str(i+1), kind='bar', color = 'Tomato', ax = ax)
plt.tight_layout()
plt.show()
# Collect the union of the top-15-loading feature names across the first
# MaxPC (22) components; PCList becomes the reduced raw-feature list used later.
MaxPC = 22
PCList = []
for i in range(0,MaxPC):
components = pd.Series(pca_class.components_[i], index=OPMScaledAnalysisData.columns)
maxcomponent = pd.Series(pd.DataFrame(abs(components).sort_values(ascending=False).head(15)).index)
PCList.append(maxcomponent)
PCList = pd.concat(PCList).drop_duplicates().sort_values(ascending=True).reset_index(drop = True)
print(PCList)
PCList = list(PCList)
Total of 48 features of the original 98 are identified, by taking the top 15 feature loadings within the first 22 components as determined above as the appropriate components to maximize variance explained. We may now, optionally utilize these 48 features identified, or utilize principal component vectors for analysis in the next steps.
Due to the disproportionate number of observations in each separation type in our dataset, we need to create class weightings. Using scikit-learn's class_weight utility, we compute an array of weights to be used downstream in our models.
# Balanced class weights for the uneven SEP class sizes; the weight order
# follows the order of the distinct SEP values passed as the class list.
OPMClassWeights = class_weight.compute_class_weight("balanced", OPMAnalysisDataNoFam["SEP"].drop_duplicates(), OPMAnalysisDataNoFam["SEP"])
display(stratumProf)
display(pd.DataFrame({"Weight": OPMClassWeights, "SEP": OPMAnalysisDataNoFam["SEP"].drop_duplicates()}))
We have chosen to utilize Stratified KFold Cross Validation for our classification analysis, with 5 folds. This means that from our original sample size of 16,638, each "fold" will save off approximately 20% as test observations, utilizing the rest as training observations, all while keeping the ratio of separation classes equal across the splits. This process will occur through 5 iterations, or folds, to allow us to cross-validate our results amongst different test/train combinations. We have utilized a random_state seed equal to the length of the original sampled dataset to ensure reproducible results.
# 5-fold stratified cross-validation, seeded by the dataset length.
# NOTE(review): without shuffle=True, StratifiedKFold's random_state has no
# effect on the splits -- confirm whether shuffling was intended.
seed = len(OPMAnalysisDataNoFam)
cv = StratifiedKFold(n_splits = 5, random_state = seed)
print(OPMAnalysisDataNoFam.shape)
print(cv)
Max Depth The maximum depth (levels) in the tree. When a value is set, the tree may not split further once this level has been met regardless of how many nodes are in the leaf.
Max Features Number of features to consider when looking for a split.
Minimum Samples in Leaf Minimum number of samples required to be in a leaf node. Splits may not occur which cause the number of samples in a leaf to be less than this value. Too low a value here leads to overfitting the tree to train data.
Minimum Samples to Split Minimum number of samples required to split a node. Care was taken during parameter tests to keep the ratio between Min Samples in Leaf and Min Samples to Split equal to that of the default values (1:2). This was done to allow an even 50/50 split on nodes which match the lowest granularity split criteria. Similar to the minimum samples in leaf, too low a value here leads to overfitting the tree to train data.
n_estimators Number of Trees generated in the forest. Increasing the number of trees, in our models increased accuracy while decreasing performance. We tuned to provide output that completed all 10 iterations in under 10 minutes.
| max_depth | max_features | min_samples_leaf | min_samples_split | n_estimators |
|---|---|---|---|---|
| TBD | TBD | TBD | TBD | TBD |
%%time
def rfc_explor(n_estimators,
               max_features,
               max_depth,
               min_samples_split,
               min_samples_leaf,
               Data = OPMAnalysisDataNoFam,
               cols = PCList,
               cv = cv,
               seed = seed):
    """Cross-validate a weighted RandomForest on the selected raw feature columns.

    The first five parameters mirror RandomForestClassifier hyperparameters;
    Data/cols/cv/seed default to the notebook-level objects bound at
    definition time.

    Returns a numpy array of the per-fold accuracies, followed by their mean,
    followed by the wall-clock run time (appending the timedelta coerces the
    array to dtype=object).
    """
    startTime = datetime.now()
    y = Data["SEP"].values  # class labels
    # .values replaces the deprecated DataFrame.as_matrix() (removed in pandas >= 1.0).
    X = Data[cols].values
    rfc_clf = RandomForestClassifier(n_estimators=n_estimators, max_features = max_features, max_depth=max_depth, min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf, class_weight = "balanced", n_jobs=-1, random_state = seed) # get object
    # Scale inside the pipeline so each CV fold fits the scaler on its own training split.
    clf_pipe = Pipeline(
        [('minMaxScaler', MinMaxScaler()),
         ('CLF',rfc_clf)]
    )
    accuracy = cross_val_score(clf_pipe, X, y, cv=cv.split(X, y)) # this also can help with parallelism
    MeanAccuracy = sum(accuracy)/len(accuracy)
    accuracy = np.append(accuracy, MeanAccuracy)
    endTime = datetime.now()
    TotalTime = endTime - startTime
    accuracy = np.append(accuracy, TotalTime)
    return accuracy
%%time
def rfc_explor_w_PCA(n_estimators,
                     max_features,
                     max_depth,
                     min_samples_split,
                     min_samples_leaf,
                     PCA,
                     Data = OPMAnalysisDataNoFam,
                     cv = cv,
                     seed = seed):
    """Cross-validate a weighted RandomForest on PCA-transformed features.

    Same as rfc_explor, but all feature columns are used and a caller-supplied
    PCA transformer is inserted between scaling and the classifier.

    Returns a numpy array of the per-fold accuracies, followed by their mean,
    followed by the wall-clock run time (appending the timedelta coerces the
    array to dtype=object).
    """
    startTime = datetime.now()
    y = Data["SEP"].values  # class labels
    # .values replaces the deprecated DataFrame.as_matrix() (removed in pandas >= 1.0).
    X = Data.drop("SEP", axis=1).values
    rfc_clf = RandomForestClassifier(n_estimators=n_estimators, max_features = max_features, max_depth=max_depth, min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf, class_weight = "balanced", n_jobs=-1, random_state = seed) # get object
    # setup pipeline to take PCA, then fit a clf model
    clf_pipe = Pipeline(
        [('minMaxScaler', MinMaxScaler()),
         ('PCA', PCA),
         ('CLF',rfc_clf)]
    )
    accuracy = cross_val_score(clf_pipe, X, y, cv=cv.split(X, y)) # this also can help with parallelism
    MeanAccuracy = sum(accuracy)/len(accuracy)
    accuracy = np.append(accuracy, MeanAccuracy)
    endTime = datetime.now()
    TotalTime = endTime - startTime
    accuracy = np.append(accuracy, TotalTime)
    return accuracy
%%time
# Cell: run the hyperparameter grid three ways -- all raw features, the
# PCA-selected raw features (PCList default), and a PCA pipeline -- and
# tabulate per-fold accuracies for each parameter row.
acclist = []
fullColumns = list(OPMAnalysisDataNoFam.columns)
# NOTE(review): removing from a list while iterating over it skips the next
# element; this works only because "SEP" appears once.  A plain
# fullColumns.remove("SEP") would be safer.
for i in fullColumns:
if i == "SEP": fullColumns.remove(i)
# Parallel lists: column k across these five lists is one hyperparameter combo.
n_estimators = [10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 5 , 15 ]
max_features = ['auto', 'auto' , 'auto', 'auto', 'auto', 'auto', 'auto', 14 , 14 , 14 , 14 , 14 , 14 ]
max_depth = [None , None , None , None , None , None , None , None , 1000 , 500 , 100 , 1000 , 1000 ]
min_samples_split = [2 , 8 , 12 , 16 , 20 , 50 , 80 , 50 , 50 , 50 , 50 , 50 , 50 ]
min_samples_leaf = [1 , 4 , 6 , 8 , 10 , 25 , 40 , 25 , 25 , 25 , 25 , 25 , 25 ]
##Model with all Raw Scaled Features
for i in range(0,len(n_estimators)):
acclist.append(rfc_explor(n_estimators = n_estimators[i],
max_features = max_features[i],
max_depth = max_depth[i],
min_samples_split = min_samples_split[i],
min_samples_leaf = min_samples_leaf[i],
cols = fullColumns
)
)
# NOTE(review): the rename below assumes the dict-built columns come out in
# alphabetical order (true for old pandas, which sorted dict keys) -- verify
# the header-to-column alignment after any pandas upgrade.
rfcdf = pd.DataFrame(pd.concat([pd.DataFrame({ "ModelVersion": "All Raw Features",
"n_estimators": n_estimators,
"max_features": max_features,
"max_depth": max_depth,
"min_samples_split": min_samples_split,
"min_samples_leaf": min_samples_leaf
}),
pd.DataFrame(acclist)], axis = 1).reindex())
rfcdf.columns = ['ModelVersion', 'max_depth', 'max_features', 'min_samples_leaf','min_samples_split', 'n_estimators', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
del rfcdf, acclist
acclist = []
## Model with only top 15 raw Scaled Principal Features
# (cols defaults to PCList inside rfc_explor)
for i in range(0,len(n_estimators)):
acclist.append(rfc_explor(n_estimators = n_estimators[i],
max_features = max_features[i],
max_depth = max_depth[i],
min_samples_split = min_samples_split[i],
min_samples_leaf = min_samples_leaf[i]
)
)
rfcdf = pd.DataFrame(pd.concat([pd.DataFrame({ "ModelVersion": "Top 15 Raw from PC",
"n_estimators": n_estimators,
"max_features": max_features,
"max_depth": max_depth,
"min_samples_split": min_samples_split,
"min_samples_leaf": min_samples_leaf
}),
pd.DataFrame(acclist)], axis = 1).reindex())
rfcdf.columns = ['ModelVersion', 'max_depth', 'max_features', 'min_samples_leaf','min_samples_split', 'n_estimators', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
del rfcdf, acclist
### Model with PCA
acclist = []
for i in range(0,len(n_estimators)):
acclist.append(rfc_explor_w_PCA(n_estimators = n_estimators[i],
max_features = max_features[i],
max_depth = max_depth[i],
min_samples_split = min_samples_split[i],
min_samples_leaf = min_samples_leaf[i],
PCA = PCA(n_components=22, svd_solver='randomized', random_state = seed)
)
)
rfcdf = pd.DataFrame(pd.concat([pd.DataFrame({ "ModelVersion": "With PCA",
"n_estimators": n_estimators,
"max_features": max_features,
"max_depth": max_depth,
"min_samples_split": min_samples_split,
"min_samples_leaf": min_samples_leaf
}),
pd.DataFrame(acclist)], axis = 1).reindex())
rfcdf.columns = ['ModelVersion', 'max_depth', 'max_features', 'min_samples_leaf','min_samples_split', 'n_estimators', 'Iteration 0', 'Iteration 1', 'Iteration 2', 'Iteration 3', 'Iteration 4', 'MeanAccuracy', 'RunTime']
display(rfcdf)
#'Iteration 5', 'Iteration 6', 'Iteration 7', 'Iteration 8', 'Iteration 9',
We have created a function to be re-used for our cross-validation Accuracy Scores. Inputs of PCA components, Model CLF object, original sample data, and a CV containing our test/train splits allow us to easily produce an array of Accuracy Scores for the different permutations of models tested. A XXXXXXTBDXXXXX plot is also displayed depicting a view of the misclassification values for each iteration. Finally, a confusion matrix is displayed for the last test/train iteration for further interpretation on results.
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix, e.g. from ``sklearn.metrics.confusion_matrix``.
    classes : sequence of str
        Axis tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, scale each row to sum to 1 before plotting.
    title : str, default 'Confusion matrix'
        Figure title.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap for the heat map.
    """
    # Normalize BEFORE drawing. The previous version called imshow()/colorbar()
    # on the raw counts and only then normalized, so the heat map colors and
    # colorbar disagreed with the printed matrix and the per-cell annotations.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.rcParams['figure.figsize'] = (18, 6)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Annotate every cell; flip the text color at half the max value so the
    # annotation stays readable on both light and dark cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, round(cm[i, j], 2),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
%%time
def compute_kfold_scores_Classification( clf,
                                         Data = OPMAnalysisDataNoFam,
                                         cols = PCList,
                                         cv = cv):
    """
    Cross-validate a classifier on the separation-status labels.

    For each train/test split in ``cv`` the classifier is fit inside a
    MinMaxScaler pipeline and its test-set accuracy recorded. After the
    final split, a crosstab and a normalized confusion matrix plot for
    that last split are displayed.

    Parameters
    ----------
    clf : sklearn classifier
        Estimator to evaluate (fit/predict interface).
    Data : pandas.DataFrame, default OPMAnalysisDataNoFam
        Source data; must contain a "SEP" label column plus ``cols``.
    cols : list of str, default PCList
        Feature columns to use.
    cv : cross-validation splitter, default cv
        Provides the train/test index splits via ``cv.split(X, y)``.

    Returns
    -------
    (classifier, list of float)
        The fitted CLF step from the last split's pipeline, and the
        per-split accuracy scores (rounded to 5 decimals).
    """
    # Fixed code <-> label table for separation status. Any label outside
    # the five known codes falls through to 5 ('SI'), matching the original
    # nested np.where default branch.
    labelNames = np.array(['NS', 'SA', 'SC', 'SD', 'SH', 'SI'])
    labelCodes = {'NS': 0, 'SA': 1, 'SC': 2, 'SD': 3, 'SH': 4}
    y = np.array([labelCodes.get(lbl, 5) for lbl in Data["SEP"].values])
    # .values (consistent with y above); as_matrix() was deprecated and
    # removed in pandas 1.0.
    X = Data[cols].values
    # Scale features to [0, 1] before fitting the classifier.
    clf_pipe = Pipeline(
                        [('minMaxScaler', MinMaxScaler()),
                         ('CLF', clf)]
                       )
    accuracy = []
    for train, test in cv.split(X, y):
        clf_pipe.fit(X[train], y[train])        # train on this split
        y_hat = clf_pipe.predict(X[test])       # test-set predictions
        a = float(mt.accuracy_score(y[test], y_hat))
        accuracy.append(round(a, 5))
    print("Accuracy Ratings across all iterations: {0}\n\n\
Average Accuracy: {1}\n".format(accuracy, round(sum(accuracy)/len(accuracy), 5)))
    # Map integer codes back to label names for the last split's crosstab.
    ytestnames = labelNames[y[test]]
    yhatnames = labelNames[y_hat]
    print("confusion matrix\n{0}\n".format(pd.crosstab(ytestnames, yhatnames, rownames = ['True'], colnames = ['Predicted'], margins = True)))
    # Plot the normalized confusion matrix for the final split.
    plt.figure()
    plot_confusion_matrix(confusion_matrix(y[test], y_hat),
                          classes =["NS", "SA", "SC", "SD", "SH", "SI"],
                          normalize =True,
                          title ='Confusion matrix, with normalization')
    return clf_pipe.named_steps['CLF'], accuracy
%%time
## Final random forest run with the hyperparameters selected above.
## max_depth is an int (was 1000.0): scikit-learn documents max_depth as
## "int or None" and newer releases reject float values.
rfc_clf = RandomForestClassifier(n_estimators = 15,
                                 max_features = 14,
                                 max_depth = 1000,
                                 min_samples_split = 50,
                                 min_samples_leaf = 25,
                                 class_weight = "balanced",  # reweight for class imbalance
                                 n_jobs = -1,                # use all available cores
                                 random_state = seed)        # reproducible forests
rfc_clf, rfc_acc = compute_kfold_scores_Classification(rfc_clf, cols = fullColumns)